In this case study, we build a classifier to predict the probability that a customer falls into each income range.
Each row describes one customer: their relationship, occupation, education, marital status, etc. The class column (the target) takes the values <=50K and >50K.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, auc, roc_curve
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from statsmodels.formula.api import ols
from sklearn.tree import DecisionTreeRegressor
# Load the Adult (census income) dataset and take a first look at its
# shape, columns, dtypes, and missing values.
dt=pd.read_csv('adult.csv')
dt.head()
dt.columns
dt.dtypes
dt.isnull().sum()
# Target column: the income-bracket labels we want to predict.
dt['class'].unique()
dt.shape
dt['workclass'].unique()
# The raw data encodes missing workclass values as '?'; relabel them.
dt['workclass']=dt['workclass'].replace({'?':'Unknown'})
dt.workclass.mode()
dt.workclass.value_counts()
# Mean capital gain per workclass, split by income class.
# FIX: seaborn removed `factorplot` (and its `size` kwarg) in v0.9;
# `catplot` with `height` is the supported replacement.
g = sns.catplot(x='workclass', y='capitalgain', hue='class', data=dt,
                kind='bar', height=6, palette='muted')
g.despine(left=True)
g = g.set_ylabels('Capital Gain')
plt.xticks(rotation=45)
# Overall share of customers per workclass, as a pie chart.
# (Renamed from `Price_month`/`months`, which were misleading leftovers.)
workclass_counts = dt['workclass'].value_counts()
workclass_df = pd.DataFrame(data=workclass_counts.index, columns=["workclass"])
workclass_df['values'] = workclass_counts.values
fig = px.pie(workclass_df, values='values', names='workclass',
             color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
def histogram(data, path, color, title, xaxis, yaxis):
    """Show a plotly histogram of column `path` in `data`, colored by `color`.

    `title`, `xaxis`, and `yaxis` label the figure; small bar gaps keep the
    per-color groups readable.
    """
    layout = dict(
        title_text=title,
        xaxis_title_text=xaxis,
        yaxis_title_text=yaxis,
        bargap=0.2,
        bargroupgap=0.1,
    )
    chart = px.histogram(data, x=path, color=color)
    chart.update_layout(**layout)
    chart.show()
# Count of each workclass, split by income class.
histogram(dt,"workclass","class",'class on workclass','workclass','Count')
# Within-class percentage breakdown of workclass (normalize=True gives
# per-class fractions, scaled to percentages and rounded to 2 decimals).
df_edu=dt.groupby('class')['workclass'].value_counts(normalize=True)
df_edu = df_edu.mul(100).rename('Percent').reset_index()
df_edu['Percent']=df_edu['Percent'].round(decimals=2)
df_edu.head(10)
px.bar(df_edu, x='class', y='Percent', color='workclass', title="class as yes w.r.t workclass",
barmode='group', text='Percent')
# Education: overall share (pie), counts by class (histogram),
# and within-class percentages (grouped bar).
dt_edu = dt['education'].value_counts()
education = pd.DataFrame(data=dt_edu.index, columns=["education"])
education['values'] = dt_edu.values
fig = px.pie(education, values='values', names='education', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"education","class",'class on education','education','Count')
df_edu=dt.groupby('class')['education'].value_counts(normalize=True)
df_edu = df_edu.mul(100).rename('Percent').reset_index()
df_edu['Percent']=df_edu['Percent'].round(decimals=2)
df_edu.head(10)
px.bar(df_edu, x='class', y='Percent', color='education', title="class w.r.t education",
barmode='group', text='Percent')
# Marital-status: overall share (pie), counts by class (histogram),
# and within-class percentages (grouped bar).
dt_marr = dt['marital-status'].value_counts()
marriage = pd.DataFrame(data=dt_marr.index, columns=["marital-status"])
marriage['values'] = dt_marr.values
fig = px.pie(marriage, values='values', names='marital-status',
             color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"marital-status","class",'class on marital-status','marital-status','Count')
df_married=dt.groupby('class')['marital-status'].value_counts(normalize=True)
df_married = df_married.mul(100).rename('Percent').reset_index()
# BUG FIX: this line previously rounded `df_edu['Percent']` (the education
# table) into df_married, silently overwriting the marital-status
# percentages with unrelated values.
df_married['Percent']=df_married['Percent'].round(decimals=2)
df_married.head(10)
px.bar(df_married, x='class', y='Percent', color='marital-status', title="class w.r.t marital-status",
barmode='group', text='Percent')
# Occupation also uses '?' for missing values; relabel, then plot its
# overall share (pie), counts by class (histogram), and within-class
# percentages (grouped bar).
dt['occupation']=dt['occupation'].replace({'?':'Unknown'})
dt_occu = dt['occupation'].value_counts()
occupation = pd.DataFrame(data=dt_occu.index, columns=["occupation"])
occupation['values'] = dt_occu.values
fig = px.pie(occupation, values='values', names='occupation', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"occupation","class",'class on occupation','occupation','Count')
df_occu=dt.groupby('class')['occupation'].value_counts(normalize=True)
df_occu = df_occu.mul(100).rename('Percent').reset_index()
df_occu['Percent']=df_occu['Percent'].round(decimals=2)
df_occu.head(10)
px.bar(df_occu, x='class', y='Percent', color='occupation', title="class w.r.t occupation",
barmode='group', text='Percent')
# Same three views for the relationship column.
dt_rel = dt['relationship'].value_counts()
relationship = pd.DataFrame(data=dt_rel.index, columns=["relationship"])
relationship['values'] = dt_rel.values
fig = px.pie(relationship, values='values', names='relationship', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"relationship","class",'class on relationship','relationship','Count')
df_rel=dt.groupby('class')['relationship'].value_counts(normalize=True)
df_rel = df_rel.mul(100).rename('Percent').reset_index()
df_rel['Percent']=df_rel['Percent'].round(decimals=2)
df_rel.head(10)
px.bar(df_rel, x='class', y='Percent', color='relationship', title="class w.r.t relationship",
barmode='group', text='Percent')
# Race: overall share (pie), counts by class (histogram), and
# within-class percentages (grouped bar).
dt_race = dt['race'].value_counts()
race = pd.DataFrame(data=dt_race.index, columns=["race"])
race['values'] = dt_race.values
fig = px.pie(race, values='values', names='race',
             color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"race","class",'class on race','race','Count')
df_race=dt.groupby('class')['race'].value_counts(normalize=True)
df_race = df_race.mul(100).rename('Percent').reset_index()
# BUG FIX: this line previously rounded `df_edu['Percent']` into df_race,
# overwriting the race percentages with education values.
df_race['Percent']=df_race['Percent'].round(decimals=2)
df_race.head(10)
px.bar(df_race, x='class', y='Percent', color='race', title="class w.r.t race",
barmode='group', text='Percent')
# Sex: overall share (pie) and counts by class (histogram).
dt_sex = dt['sex'].value_counts()
sex = pd.DataFrame(data=dt_sex.index, columns=["sex"])
sex['values'] = dt_sex.values
fig = px.pie(sex, values='values', names='sex', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"sex","class",'class on sex','sex','Count')
# Native country: pie, histogram by class, and within-class percentages.
dt_coun = dt['native-country'].value_counts()
country = pd.DataFrame(data=dt_coun.index, columns=["native-country"])
country['values'] = dt_coun.values
fig = px.pie(country, values='values', names='native-country', color_discrete_sequence=px.colors.sequential.RdBu)
fig.show()
histogram(dt,"native-country","class",'class on native-country','native-country','Count')
# NOTE(review): `df_edu` is reused here for native-country percentages;
# harmless, but a distinct name would be clearer.
df_edu=dt.groupby('class')['native-country'].value_counts(normalize=True)
df_edu = df_edu.mul(100).rename('Percent').reset_index()
df_edu['Percent']=df_edu['Percent'].round(decimals=2)
df_edu.head(10)
px.bar(df_edu, x='class', y='Percent', color='native-country', title="class w.r.t native-country",
barmode='group', text='Percent')
# One-hot encode sex; drop_first leaves a single indicator column.
Sex=pd.get_dummies(dt.sex,drop_first=True,prefix='Sex')
dt=pd.concat([dt,Sex],axis=1)
dt.drop('sex',axis=1,inplace=True)
dt.head()
# Frequency-ordered ordinal encoding of workclass: categories are ranked by
# their row count and mapped to 0..n-1 (rarest -> 0, most common -> n-1).
dt.workclass.unique()
ordinal_label=dt.groupby(['workclass'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['workclass_map']=dt['workclass'].map(ordinal_labels2)
dt.drop('workclass',axis=1,inplace=True)
dt.head()
# Same frequency-ordered ordinal encoding for education.
dt.education.unique()
ordinal_label=dt.groupby(['education'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['education_map']=dt['education'].map(ordinal_labels2)
dt.drop('education',axis=1,inplace=True)
dt.head()
# ...and for marital-status.
dt['marital-status'].unique()
ordinal_label=dt.groupby(['marital-status'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['marital_map']=dt['marital-status'].map(ordinal_labels2)
dt.drop('marital-status',axis=1,inplace=True)
dt.head()
# Redundant: occupation '?' values were already replaced earlier, so this
# replace is a no-op here.
dt['occupation']=dt['occupation'].replace({'?':'Unknown'})
# Frequency-ordered ordinal encoding for occupation.
dt['occupation'].unique()
ordinal_label=dt.groupby(['occupation'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['occupation_map']=dt['occupation'].map(ordinal_labels2)
dt.drop('occupation',axis=1,inplace=True)
dt.head()
# ...and for relationship.
dt['relationship'].unique()
ordinal_label=dt.groupby(['relationship'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['relationship_map']=dt['relationship'].map(ordinal_labels2)
dt.drop('relationship',axis=1,inplace=True)
dt.head()
# Frequency-ordered ordinal encoding for race.
dt.race.unique()
ordinal_label=dt.groupby(['race'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['race_map']=dt['race'].map(ordinal_labels2)
dt.drop('race',axis=1,inplace=True)
dt.head()
# ...and for native-country. After this step every feature is numeric;
# only the 'class' target remains a string column.
dt['native-country'].unique()
ordinal_label=dt.groupby(['native-country'])['class'].count().sort_values().index
ordinal_label
list(enumerate(ordinal_label))
ordinal_labels2={k:i for i,k in enumerate(ordinal_label,0)}
ordinal_labels2
dt['native-country_map']=dt['native-country'].map(ordinal_labels2)
dt.drop('native-country',axis=1,inplace=True)
dt.head()
# Correlation heatmap of the encoded features.
# FIX: the 'class' target is still a string column at this point, so a bare
# dt.corr() raises on pandas >= 2.0; numeric_only=True restores the old
# "ignore non-numeric columns" behavior.
dt.corr(numeric_only=True)
plt.figure(figsize=(20,20))
corr=dt.corr(numeric_only=True)
sns.heatmap(corr,annot=True,cmap=plt.cm.CMRmap_r)
plt.show()
def correlation_feature(dataset, threshold):
    """Return the names of columns highly correlated with an earlier column.

    A column is reported when the absolute Pearson correlation between it and
    any column that appears before it exceeds `threshold`. Only the later
    column of each pair is returned, so the result can be dropped while
    keeping one representative feature per correlated pair.

    Parameters:
        dataset: DataFrame to analyse; non-numeric columns are ignored.
        threshold: absolute-correlation cutoff in (0, 1).

    Returns:
        set[str] of column names to consider dropping.
    """
    col_corr = set()  # names of correlated columns found so far
    # FIX: numeric_only=True keeps this working when non-numeric columns
    # (e.g. a raw string target) are present; pandas >= 2.0 raises otherwise.
    corr_matrix = dataset.corr(numeric_only=True)
    for i in range(len(corr_matrix.columns)):
        for j in range(i):  # only the lower triangle: each pair once
            if abs(corr_matrix.iloc[i, j]) > threshold:
                col_corr.add(corr_matrix.columns[i])
    return col_corr
# Columns correlated above 0.5 with an earlier column (candidates to drop).
corr_features = correlation_feature(dt, 0.5)
len(set(corr_features))
corr_features
# Zero-variance columns carry no information for the classifier.
# FIX: numeric_only=True — 'class' is still a string column here, and
# dt.var() on a mixed frame raises on pandas >= 2.0. Hoisted into a
# variable so variance is computed once.
variances = dt.var(numeric_only=True)
zerovar = variances[variances == 0].index.values
zerovar
plt.figure(figsize=(8,8))
# FIX: seaborn >= 0.12 no longer accepts the column name positionally.
sns.countplot(x='class', data=dt)
plt.show()
# Split into features and target.
x=dt.drop('class',axis=1)
y=dt['class']
from imblearn.over_sampling import SMOTE  # kept from original (unused here)
# FIX: SMOTETomek lives in imblearn.combine; its import was commented out,
# so `SMOTETomek()` below raised NameError.
from imblearn.combine import SMOTETomek
# Balance the classes with combined over-sampling (SMOTE) + Tomek-link
# cleaning, then inspect the new label distribution.
smk = SMOTETomek()
X_res,y_res=smk.fit_resample(x,y)
plt.figure(figsize=(8,8))
# FIX: y_res is a standalone Series, not a column of dt, and seaborn >= 0.12
# rejects positional data — pass it as x directly.
sns.countplot(x=y_res)
plt.show()
# 70/30 train/test split with a fixed seed for reproducibility.
x_train,x_test,y_train,y_test=train_test_split(X_res,y_res,test_size=0.3,random_state=42)
x_train.shape,y_train.shape,x_test.shape,y_test.shape
# Baseline: logistic regression on the resampled (unscaled) features,
# evaluated with a confusion matrix, accuracy, and a classification report.
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)
log_pred=log_reg.predict(x_test)
cm1=confusion_matrix(y_test,log_pred)
sns.heatmap(cm1,annot=True,fmt='d')
print(accuracy_score(y_test,log_pred))
print(classification_report(y_test,log_pred))
# Random forest on the same split, same evaluation.
model_rand=RandomForestClassifier()
model_rand.fit(x_train,y_train)
model_rand_test=model_rand.predict(x_test)
cm1=confusion_matrix(y_test,model_rand_test)
sns.heatmap(cm1,annot=True,fmt='d')
print('Accuracy Score:',accuracy_score(y_test,model_rand_test))
print(classification_report(y_test,model_rand_test))
# Binarize the target: drop_first leaves a single 'Class_>50K' indicator
# (1 = income above 50K), replacing the string 'class' column.
cl=pd.get_dummies(dt['class'],drop_first=True,prefix='Class')
dt=pd.concat([dt,cl],axis=1)
dt.drop('class',axis=1,inplace=True)
dt.head()
# Manual min-max scaling of every column to [0, 1].
# NOTE(review): a constant (zero-range) column would divide by zero here —
# zero-variance columns were only inspected earlier, not dropped; confirm
# none exist in this dataset.
min_dt=dt.min()
range_dt=(dt-min_dt).max()
dt_scaled = (dt-min_dt)/range_dt
dt_scaled.head()
plt.figure(figsize=(8,8))
# FIX: seaborn >= 0.12 no longer accepts the column name positionally.
sns.countplot(x='Class_>50K', data=dt_scaled)
plt.show()
# Split the scaled data into features and the binary target.
x=dt_scaled.drop('Class_>50K',axis=1)
y=dt_scaled['Class_>50K']
# FIX: SMOTETomek is defined in imblearn.combine; the file only imported
# SMOTE, so `SMOTETomek()` raised NameError.
from imblearn.combine import SMOTETomek
smk = SMOTETomek()
X_res,y_res=smk.fit_resample(x,y)
plt.figure(figsize=(8,8))
# FIX: y_res is a standalone Series — pass it as x, not positionally with
# an unrelated `data=` frame.
sns.countplot(x=y_res)
plt.show()
# Same 70/30 split and seed as the unscaled experiment, for comparability.
x_train,x_test,y_train,y_test=train_test_split(X_res,y_res,test_size=0.3,random_state=42)
x_train.shape,y_train.shape,x_test.shape,y_test.shape
# Repeat the two models on the scaled, resampled data so results can be
# compared against the unscaled run above.
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)
log_pred=log_reg.predict(x_test)
cm1=confusion_matrix(y_test,log_pred)
sns.heatmap(cm1,annot=True,fmt='d')
print(accuracy_score(y_test,log_pred))
print(classification_report(y_test,log_pred))
# Random forest on the scaled split.
model_rand=RandomForestClassifier()
model_rand.fit(x_train,y_train)
model_rand_test=model_rand.predict(x_test)
cm1=confusion_matrix(y_test,model_rand_test)
sns.heatmap(cm1,annot=True,fmt='d')
print('Accuracy Score:',accuracy_score(y_test,model_rand_test))
print(classification_report(y_test,model_rand_test))